With the growing need for an online presence in politics, the social media accounts of those running for public office are becoming more and more important. Here we look at the defining characteristics of the Tweets of the two presidential candidates in the 2020 election.
The goal of this analysis is to
#import stuff
import pandas as pd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import spacy
from textstat.textstat import textstatistics, legacy_round, textstat
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
Our Tweets dataset is from Kaggle.
Here we clean the data by removing stop words. The stop words include the words in the English stopwords library as well as some additional ones we found in the Tweets.
Some problems with our dataset we fixed and considered are
We will talk about how we fixed these problems throughout this notebook.
# read data
# Load the raw tweet exports for both candidates.
joe_data = pd.read_csv('data/JoeBidenTweets.csv')
donald_data = pd.read_csv('data/realdonaldtrump.csv')

# The two CSVs keep the tweet text under different column names.
joe_tweets = joe_data['tweet']
donald_tweets = donald_data['content']

# Start from NLTK's English stop-word list and add punctuation tokens plus
# words we chose to exclude from the frequency counts for these tweets.
cachedStopWords = stopwords.words('english')
cachedStopWords.extend([
    '@', '"', 'i', 'realdonaldtrump', '#', '&', 'the', '…', '-', '—',
    'it’s', 'get', 'trump', 'make', 'president', '.',
])

# `cachedStopWords` stays a list (later cells may use it as one), but the
# per-token membership test below runs against a frozenset: a list lookup is
# linear in the number of stop words and is repeated for every token of both
# corpora.
_stop_word_lookup = frozenset(cachedStopWords)

# Word frequencies per candidate: join all tweets, lower-case, split on
# whitespace, and drop stop words. Tokens are NOT stripped of punctuation,
# so e.g. "trump," and "trump" are counted as different words.
top_joe_words = Counter(
    word
    for word in " ".join(joe_tweets).lower().split()
    if word not in _stop_word_lookup
)
top_donald_words = Counter(
    word
    for word in " ".join(donald_tweets).lower().split()
    if word not in _stop_word_lookup
)

# Print, for each hand-picked feature word, Biden's count then Trump's count.
features = 'problems, today, protecting, seniors, important, healthcare, people, point, trump, thanks, mexico, polls, illegal, closing, immigration, family, clinton, biden'
for feature in features.split(', '):
    print(feature + ": ")
    # NOTE: .get() prints None (not 0) for a word that never occurs or that
    # was filtered out as a stop word -- 'trump' is in both lists above.
    print(top_joe_words.get(feature))
    print(top_donald_words.get(feature))
Here we take a look at some general statistics about each candidate's Tweets.
These statistics are
Joe Biden Tweet dates
# joe dates
# Parse the raw timestamp column into datetimes. pd.to_datetime is used
# instead of astype("datetime64"): the unit-less "datetime64" dtype is
# rejected by pandas 2.x, whereas to_datetime works across versions.
joe_dates = pd.to_datetime(joe_data['timestamp'])
# Count tweets per calendar year and draw the counts as a bar chart.
joe_dates.groupby([joe_dates.dt.year]).count().plot(kind="bar", title='Biden Tweet Count')
Donald Trump Tweet dates
# donald dates
# Parse the raw date column into datetimes. pd.to_datetime is used instead
# of astype("datetime64"): the unit-less "datetime64" dtype is rejected by
# pandas 2.x, whereas to_datetime works across versions.
donald_dates = pd.to_datetime(donald_data['date'])
# Count tweets per calendar year and draw the counts as a bar chart.
donald_dates.groupby([donald_dates.dt.year]).count().plot(kind="bar", title='Trump Tweet Count')
Joe Biden wordcloud
# word cloud
# Load the silhouette image as an array; it is passed to WordCloud as `mask`.
joe_mask = np.array(Image.open('data/wordcloud/joeshape.jpg'))

# Extend wordcloud's built-in STOPWORDS with URL fragments, contraction
# leftovers and Twitter-specific tokens that should not appear in the cloud.
morestopwords = set(STOPWORDS)
morestopwords.update((
    'https',
    'com',
    'html',
    're',
    'realDonaldTrump',
    'bit',
    'ly',
    's',
    't',
    'pic',
    'twitter',
))

# Combine with scikit-learn's English stop-word list.
# NOTE(review): this rebinds the module-level name `stopwords`, hiding the
# `nltk.corpus.stopwords` import; re-running an earlier cell that calls
# `stopwords.words(...)` after this one will fail. Left as-is because later
# cells may depend on this binding.
stopwords = text.ENGLISH_STOP_WORDS.union(morestopwords)
def blue_color_func(word, font_size, position,orientation,random_state=None, **kwargs):
    """Return a blue HSL colour string for one word of a word cloud.

    The signature matches the ``color_func`` callback passed to
    ``WordCloud.recolor``. The hue (230) and saturation (100%) are fixed;
    only the lightness varies, and it is either 49% or 50%.

    Args:
        word, font_size, position, orientation: Supplied by the caller of
            the callback; not used here.
        random_state: Optional ``random.Random`` instance. When given, the
            lightness is drawn from it so the colouring is reproducible;
            otherwise NumPy's global random generator is used.
        **kwargs: Any further keyword arguments are accepted and ignored.

    Returns:
        A string of the form ``"hsl(230,100%, NN%)"``.
    """
    from random import Random

    if isinstance(random_state, Random):
        # random.Random.randint includes both endpoints.
        lightness = random_state.randint(49, 50)
    else:
        # numpy's randint excludes the upper bound, so this is 49 or 50 too.
        lightness = np.random.randint(49, 51)
    return "hsl(230,100%%, %d%%)" % lightness
# Build the Biden word cloud inside the mask shape, using the combined
# stop-word set, from all of his tweets joined into a single string.
wc = WordCloud(
    width=800,
    height=400,
    background_color="white",
    max_words=1000,
    mask=joe_mask,
    stopwords=stopwords,
)
joe_corpus = " ".join(joe_tweets)
wc.generate(joe_corpus)

# Recolour every word with the blue palette callback.
wc.recolor(color_func=blue_color_func)

# Render without axes and write the figure to disk.
# NOTE(review): an 8x6 inch figure at dpi=2000 is a 16000x12000 pixel canvas;
# confirm this resolution is intended.
plt.figure(figsize=(8, 6), dpi=2000)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig("images/joecloud.png")